library(rio)
library(quanteda)
library(ggplot2)
library(reshape2)
library(tidyverse)
library(tidytext)
library(texreg)
library(AER)
library(sampleSelection)
library(survival)
library(ggfortify)
library(survminer)
library(ggeffects)
library(retrodesign)
library(MatchIt)
library(cobalt)
library(modelsummary)

##set your working directory
setwd("~/Downloads/")

##load the dataset
survey <- readRDS("CautionaryTaleData.RDS")


#################
#TABLE 1
#################

balance_data <- 
  survey %>% 
  dplyr::select(Openn:Neuro, Finished, Fatigue, gender, race, educ, age, income) %>% 
  na.omit() %>%
  mutate(Finished = case_when(Finished == TRUE ~ 1, TRUE ~ 0)) %>%
  model.matrix( ~ 1 + Finished + Openn + Consc + Extra + Agree + Neuro + Fatigue + I(gender == "Female") + 
                  I(race == "Black") + I(race == "Hispanic or Latino") + I(!(race %in% c("White","Black","Hispanic"))) +
                  age + I(educ == "Did not graduate from high school") + I(educ == "High school graduate") +
                  I(educ == "2-year college degree") + I(educ == "4-year college degree") + 
                  I(educ == "Postgraduate degree (MA, MBA, MD, JD, PhD, etc.)") + 
                  income + I(income == 8), data = .) %>%
  as.data.frame()

coef_labels <-
  c("Finished", "Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Neuroticism", 
    "Fatigue (logged duration)",
    "Female", "Black", "Hispanic", "Other Race", "Age",
    "Education < HS", "Education = HS", "Education = 2 yr. Coll.", "Education = 4 yr. Coll.",
    "Education = Postgrad. Degree", "Income", "Income Refused")


colnames(balance_data)[-1] <- coef_labels

balance_data <- 
  balance_data %>%
  mutate(Finished = case_when(Finished == 1 ~ "Good Completes", Finished == 0 ~ "Bad Completes"))

datasummary_balance(~Finished,
                    data = balance_data[,-1], 
                    stars = TRUE,  
                    vcov = "robust",
                    fmt = fmt_decimal(digits = 2, pdigits = 3),
                    output = "latex_tabular"
)



#################
#TABLE A-2
#################

mod_compl <- glm(Finished ~ scale(Openn) + scale(Consc) + scale(Extra) + scale(Agree) + scale(Neuro), 
                 data = survey, family = binomial())

mod_compl_full <- glm(Finished ~ scale(Openn) + scale(Consc) + scale(Extra) + scale(Agree) + scale(Neuro) + Fatigue +
                        I(gender == "Female") + 
                        I(race == "Black") + I(race == "Hispanic or Latino") + I(!(race %in% c("White","Black","Hispanic"))) +
                        age + I(age^2/100) + I(educ == "Did not graduate from high school") + I(educ == "High school graduate") +
                        I(educ == "2-year college degree") + I(educ == "4-year college degree") + 
                        I(educ == "Postgraduate degree (MA, MBA, MD, JD, PhD, etc.)") + 
                        income + I(income == 8),
                      data = survey, family = binomial())

texreg(list(mod_compl, mod_compl_full), omit.coef = "factor", 
          custom.coef.names = c("Intercept", "Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Neuroticism", 
                                "Fatigue",
                                "Female", "Black", "Hispanic", "Other Race", "Age", "Age Sq. (/100)", 
                                "Education < HS", "Education = HS", "Education = 2 yr. Coll.", "Education = 4 yr. Coll.",
                                "Education = Postgrad. Degree", "Income", "Income Refused"))


#prep for figure 1
compl_res <- as.data.frame(rbind(summary(mod_compl)$coef[2:6,1:2],
                                 summary(mod_compl_full)$coef[2:6,1:2]),
                           row.names = FALSE)

compl_res$Trait <- rep(c("Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Neuroticism"), 2)
compl_res$Model <- rep(c("Traits Alone", "Traits with Demographics"), each = 5)
compl_res$lower <- compl_res$Estimate - 1.96*compl_res$`Std. Error`
compl_res$upper <- compl_res$Estimate + 1.96*compl_res$`Std. Error`

compl_res$Sig <- "no"
compl_res$Sig[(compl_res$lower * compl_res$upper > 0)] <- "yes"

plot_completes <-
  ggplot(compl_res,
         aes(factor(Trait, 
                     levels = rev(c("Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Neuroticism"))), 
             Estimate, shape = Model, linetype = Model)) +
  geom_point(position = position_dodge(width=0.9), size = 3) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = .1, position = position_dodge(width=0.9)) +
  coord_flip() +
  scale_x_discrete(labels = function(x) gsub(" .*", "", x)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  ylab("Coefficient Estimate\n(Standardized)") + 
  xlab("") + 
  theme_bw() +
  theme(axis.text.y=element_text(size=20), axis.text.x=element_text(size=15, angle=45, hjust=1), 
        axis.title.x=element_text(size=20), axis.title.y=element_text(size=20),
        legend.text = element_text(size=20), legend.title = element_text(size=20), legend.position = "bottom")

#################
#FIGURE 1
#################
ggsave(plot_completes, file = paste0(getwd(), "/Figure1.pdf"), 
       height = 7, width = 17)


#prep for Figure 2

trait <- "Neuro"
survive_est <- function(trait = "Neuro"){
  
  survival_model <- survfit(Surv(Progress, status) ~ survey[,substring(trait,1,1)],
                            data=survey)
  survival_model$trait <- trait
  
  ggsurv <- ggsurvplot(
    survival_model,                     # survfit object with calculated statistics.
    data = survey,             # data used to fit survival curves.
    size = 1,                 # change line size
    palette = 
      c("grey0", "grey45"),# custom color palettes
    conf.int = TRUE,          # Add confidence interval
    pval = TRUE,              # Add p-value
    risk.table = FALSE,        # Add risk table
    legend.labs = 
      paste0(c("Below Average ","Above Average "), survival_model$trait),    # Change legend labels
    risk.table.height = 0.25, # Useful to change when you have multiple groups
    ylim = c(0.7, 1.0),
    xlim = c(30,100),
    xlab = "Survey Progress",
    ggtheme = theme_bw()      # Change ggplot2 theme
  )
  
  pdf(paste0(getwd(), "/Figure2_", trait,"_survival.pdf"))
  print(ggsurv, newpage = FALSE)
  dev.off()
}

#################
#FIGURE 2
#################

survive_est(trait = "Agree")
survive_est(trait = "Neuro")

#define a regression function to loop over
reg_fun <- function(var){
  
  survey$depvar <- scale(survey[,var])
  for (var in c("Openn", "Consc", "Extra", "Agree", "Neuro")) survey[,var] <- scale(survey[,var])
  
  mod_ideo_full <- lm(depvar ~ Openn + Consc + Extra + Agree + Neuro + Fatigue + I(gender == "Female") + 
                        I(race == "Black") + I(race == "Hispanic or Latino") + I(!(race %in% c("White","Black","Hispanic"))) +
                        age + I(age^2/100) + I(educ == "Did not graduate from high school") + I(educ == "High school graduate") +
                        I(educ == "2-year college degree") + I(educ == "4-year college degree") + 
                        I(educ == "Postgraduate degree (MA, MBA, MD, JD, PhD, etc.)") + 
                        income + I(income == 8),
                      data = survey)
  
  mod_ideo_compl <- lm(depvar ~ Openn + Consc + Extra + Agree + Neuro + Fatigue + I(gender == "Female") + 
                         I(race == "Black") + I(race == "Hispanic or Latino") + I(!(race %in% c("White","Black","Hispanic"))) +
                         age + I(age^2/100) + I(educ == "Did not graduate from high school") + I(educ == "High school graduate") +
                         I(educ == "2-year college degree") + I(educ == "4-year college degree") + 
                         I(educ == "Postgraduate degree (MA, MBA, MD, JD, PhD, etc.)") + 
                         income + I(income == 8), 
                       data = subset(survey, Finished == TRUE))
  
  mod_ideo_noncompl <- lm(depvar ~ Openn + Consc + Extra + Agree + Neuro + Fatigue + I(gender == "Female") + 
                            I(race == "Black") + I(race == "Hispanic or Latino") + I(!(race %in% c("White","Black","Hispanic"))) +
                            age + I(age^2/100) + I(educ == "Did not graduate from high school") + I(educ == "High school graduate") +
                            I(educ == "2-year college degree") + I(educ == "4-year college degree") + 
                            I(educ == "Postgraduate degree (MA, MBA, MD, JD, PhD, etc.)") + 
                            income + I(income == 8),
                          data = subset(survey, Finished == FALSE))
  
  res <- as.data.frame(rbind(summary(mod_ideo_full)$coef[2:6,1:2],
                             summary(mod_ideo_compl)$coef[2:6,1:2],
                             summary(mod_ideo_noncompl)$coef[2:6,1:2]), row.names = FALSE)
  
  res <- cbind(res, rbind(confint(mod_ideo_full)[2:6,], confint(mod_ideo_compl)[2:6,], confint(mod_ideo_noncompl)[2:6,]))
  colnames(res)[3:4] <- c("lower", "upper")
  res$Trait <- rep(c("Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Neuroticism"), 3)
  res$Model <- rep(c("All Data", "Good Completes", "Bad Completes"), each = 5)
  
  return(list(res, mod_ideo_full, mod_ideo_compl, mod_ideo_noncompl))
}

#get ready for Figure 3

pars <- c("pid7_rev", "Liberalism", "vote_lib_cands", "believe_relig",
          "stand_natl_anthem", "letters", "news", "talkpolitics", "knowledge", "church_attend")

pars_full <- c("Democratic Party ID", "Liberalism", "Vote for Lib. Cands.", "One True Religion",
               "Stand for Natl. Anthem", "Write Letters to Editor",
               "Attention to Politics", "Days Talking Politics", "Political Knowledge",
               "Church Attendance")


all_results <- do.call(rbind, lapply(pars, function(x) reg_fun(x)[[1]]))
all_results$DV <- rep(pars_full, each = 3*5)

all_results$Sig <- "no"
all_results$Sig[(all_results$lower * all_results$upper > 0)] <- "yes"

#################
#FIGURE 3
#################

plot_mods_rev <-
  all_results %>% 
  filter(Model != "All Data") %>% 
  group_by(DV, Trait) %>% 
  summarise(EstDiff = Estimate[Model == "Good Completes"] - Estimate[Model == "Bad Completes"],
            SeDiff = sqrt(`Std. Error`[Model == "Good Completes"]^2 + `Std. Error`[Model == "Bad Completes"]^2),
            Tstat = EstDiff/SeDiff) %>%
  ungroup() %>%
  ggplot(aes(factor(Trait, 
                    levels = rev(c("Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Neuroticism"))), 
             EstDiff)) +
  geom_point(position = position_dodge(width=0.9)) +
  geom_errorbar(aes(ymin = EstDiff - 1.96*SeDiff, ymax = EstDiff + 1.96*SeDiff), 
                width = .1, position = position_dodge(width=0.9)) +
  facet_wrap(~ DV, scales = "fixed", nrow = 5, ncol = 2, labeller = labeller(DV = label_wrap_gen(30))) +
  coord_flip() +
  scale_x_discrete(labels = function(x) gsub(" .*", "", x)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  ylab("Over/Under-statement of Effect of Personality Trait on Outcome") + 
  xlab("") + 
  # scale_colour_grey(name = "Statistically Significant (p < 0.05)", labels = c("No", "Yes"), start = 0.5, end = 0.0) + 
  theme_bw() +
  theme(axis.text.y=element_text(size=20), axis.text.x=element_text(size=15, angle=45, hjust=1), 
        axis.title.x=element_text(size=20), axis.title.y=element_text(size=20),
        legend.text = element_text(size=20), legend.title = element_text(size=20), 
        strip.text = element_text(size = 20), legend.position = "bottom")


ggsave(plot_mods_rev, file = paste0(getwd(), "/Figure3.pdf"),
       height = 17, width = 14)


##re-make in tabular form

all_regs <- lapply(pars, function(x) list(reg_fun(x)[[3]], reg_fun(x)[[4]]))
all_regs_rev <- NULL
i <- 1
for (k in 1:length(all_regs)){
  all_regs_rev[[i]] <- all_regs[[k]][[1]]
  all_regs_rev[[i+1]] <- all_regs[[k]][[2]]
  i <- i + 2
}

#################
#TABLE A-3
#################

texreg(all_regs_rev[1:10], omit.coef = "factor", 
          custom.coef.names = c("Intercept", "Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Neuroticism",
                                "Fatigue",
                                "Female", "Black", "Hispanic", "Other Race", "Age", "Age Sq. (/100)", 
                                "Education < HS", "Education = HS", "Education = 2 yr. Coll.", "Education = 4 yr. Coll.",
                                "Education = Postgrad. Degree", "Income", "Income Refused"),
          custom.model.names = rep(pars_full[1:5], each = 2))

#################
#TABLE A-4
#################

texreg(all_regs_rev[11:20], omit.coef = "factor", 
          custom.coef.names = c("Intercept", "Openness", "Conscientiousness", "Extraversion", "Agreeableness", "Neuroticism",
                                "Fatigue",
                                "Female", "Black", "Hispanic", "Other Race", "Age", "Age Sq. (/100)", 
                                "Education < HS", "Education = HS", "Education = 2 yr. Coll.", "Education = 4 yr. Coll.",
                                "Education = Postgrad. Degree", "Income", "Income Refused"),
          custom.model.names = rep(pars_full[6:10], each = 2))



##we will look at the R2 performance across models

#################
#FIGURE 4
#################

R2diff <- 
  ggplot(
  data.frame(R2nonc = sapply(pars, function(x){
    for (var in c("Openn", "Consc", "Extra", "Agree", "Neuro")) survey[,var] <- scale(survey[,var])
    preds <- predict(reg_fun(x)[[3]], newdata = subset(survey, Finished == FALSE))
    y <- scale(survey[,x])[survey$Finished == FALSE]
    ybar <- mean(y, na.rm = TRUE)
    return(1 - sum((y-preds)^2, na.rm = TRUE)/sum((y-ybar)^2, na.rm = TRUE))
  }
  ),
  R2compl = 
    sapply(pars, function(x){
      for (var in c("Openn", "Consc", "Extra", "Agree", "Neuro")) survey[,var] <- scale(survey[,var])
      preds <- predict(reg_fun(x)[[3]], newdata = subset(survey, Finished == TRUE))
      y <- scale(survey[,x])[survey$Finished == TRUE]
      ybar <- mean(y, na.rm = TRUE)
      return(1 - sum((y-preds)^2, na.rm = TRUE)/sum((y-ybar)^2, na.rm = TRUE))
    }
    ),
  par = pars_full) %>% 
    pivot_longer(cols = R2nonc:R2compl) %>% 
    mutate(name = case_when(name == "R2nonc" ~ "Bad Completes", TRUE ~ "Good Completes")) %>%
    rename("Data Source" = "name"), 
  aes(fct_reorder(par, value), value, colour = `Data Source`)) + 
  geom_point(size = 5) + 
  coord_flip() + 
  ylab(expression(R^2)) + 
  xlab("") + theme_minimal() + 
  geom_hline(yintercept = 0, linetype = "dashed") +
  theme_bw() +
  theme(axis.text.y=element_text(size=20), axis.text.x=element_text(size=15, angle=45, hjust=1), 
        axis.title.x=element_text(size=20), axis.title.y=element_text(size=20),
        legend.text = element_text(size=20), legend.title = element_text(size=20), legend.position = "bottom")


ggsave(R2diff, file = paste0(getwd(), "/Figure4.pdf"), 
       height = 7, width = 17)


